** Clean CATS-R-FRF.dta file!

clear
set more off

use "${raw}/nlrb/CATS-R-FRF.dta",clear
drop if inlist(unit_loc_state,"Guam","Puerto Rico","Virgin Islands")

*------------------------------------------------------------------------------*
* Drop duplicates created by employer address(es):
drop employer* 
duplicates drop _all, force // Drops 310,182 observations (Iain - 310,332)

*------------------------------------------------------------------------------*
** Check types of cases, drop UD cases (deauthorization of union requirement)
g casenumber=r_case_number
gen casetype = substr(casenumber,4,2)
tab casetype
drop if casetype=="UD"
count // 29560 (Iain - 29578)

*------------------------------------------------------------------------------*
** Check status of cases, keep only closed cases

/* Iain - Check if there are different status outcomes by case number 
preserve
	duplicates drop status casenumber,force
	bys casenumber: g multistatus=_N // All 1's so no problems here
restore
 */

duplicates tag casenumber status,g(dups1) // 5406 observations with duplicate status
tab dups1

keep if status==1 //drops 269
drop status
drop dups1

*------------------------------------------------------------------------------*
/* Iain - There are duplicate outcomes for a casenumber unit-id. If you drop
cases that do not end in particular outcomes you may be dropping the true outcome 
and retaining a false one. */

/* It seems like the closing method variable has been assigned to past actions rather 
than the final action establishing the outcome */

/* I create from the raw file R_CLOSED_CASE the final outcome for each case*/

ren closing_method method_closed
merge m:1 r_case_number using "${temp}/closed_cases_clean.dta"

// Deal with Merged Datset
/*
preserve
	keep if _merge==1
	keep if r_case_number=="12-RC-09281"	
	g file_number=100
	save ${temp}\unmergers.dta,replace // one lonely observation here
restore
*/

// Limited number tricky to match method closed as multiple closing methods
// These are in closed_clean file - just multiple observations
replace closing_method = method_closed if _merge == 1
replace closing_date = date_closed if _merge == 1
keep if _merge == 1 | _merge == 3
drop _merge

drop method_closed date_closed
duplicates drop _all,force // 128 deleted


*------------------------------------------------------------------------------*
* Identify duplicates based on Tally Date and Tally Type

// We still have duplicates
duplicates tag r_case_number unit_id,g(dups1)
sort dups1 r_case_number unit_id

drop dups1

// The problem here is that multiple elections (tallies and reruns etc) are all recorded
// Keep most recent election
bys r_case_number unit_id: egen recent=max(tally_ballot_date)
format recent %td
keep if tally_ballot_date==recent // 789 observations deleted
drop recent

// Identify cases where tally type differs but otherwise identical
duplicates drop _all,force
ds tally_type,not
duplicates drop `r(varlist)',force // 7 observations dropped

// Keep most recent election type when dates overlap
bys r_case_number unit_id: egen recent=max(tally_type)
g flag = (tally_type!=recent)
bys r_case_number unit_id: egen check = total(flag)

// In some cases the recount affirms the result but does not report the count
// In this case retain old count
drop if check>0 & tally_type == 6 & num_valid_votes == 0 // 4 observations

// In other cases retain the most recent tally type
drop flag recent check
bys r_case_number unit_id: egen recent=max(tally_type)
g flag = (tally_type!=recent)
bys r_case_number unit_id: egen check = total(flag)

drop if check & flag //  14 observations
drop check flag

duplicates tag r_case_number unit_id,g(dups1)
tab dups1

preserve
	drop if dups>0
	drop dups
	save "${temp}/non_dups.dta",replace
restore

save "${temp}/all_data.dta",replace

*------------------------------------------------------------------------------*
/* Single Union Duplicates */

keep if dups>0
keep if multiunion==0 // 116 observations remaining for single union elections

// Some instances have same information but are missing votes for and against
drop if num_votes_for==. & dups>0 // 24 dropped

// Some duplicate obvservations with same information but a zero votes for count
drop if dups>0 & num_votes_for == 0 & num_votes_against< num_valid_votes //  8 dropped

// Some observations are identical except for the petitioner
drop petitioner
duplicates drop _all,force //  6 observations

// Some specific instances where the votes for is split across two observations
bys r_case_number unit_id: egen av_against = mean(num_votes_against)
bys r_case_number unit_id: egen total_for = total(num_votes_for)
replace num_votes_for = total_for if dups>0 & av_against == num_votes_against & (total_for + num_votes_against == num_valid_votes)

duplicates drop _all,force //  3 observations

drop av_against total_for recent

// Specific cases
// In raw data it would appear that 04-RM-01267 was a multi-union election between two unions
// What matters is that a union was chosen and 72 voted for at least one union
replace num_votes_against=0 if r_case_number=="04-RM-01267" 
replace num_votes_for=72 if r_case_number=="04-RM-01267" 
drop if r_case_number=="04-RM-01267"  & election_results == 0

// For "12-RM-00382" this is also the case, with 6 voting for neither
// What is relevant here is that a union was voted on by all but 6, so we retain the "WIN" result
drop if r_case_number=="12-RM-00382"  & election_results == 0


// For remaining cases keep only certified election in R_ELECT_CERTIFICATION
* Election id: 22-00839 associated with 22-RC-12542 and the instance with 582 valid votes
* Election id: 34-00277 associated with 34-RC-02068 and the instance with 102 valid votes
* Election id: 19-00762 associated with 19-RC-14621 and the instance with 3 valid votes
* Election id: 17-00811 associated with 17-RC-12434 and the instance with 1 vote for
* Election id: 16-00093 associated with 16-RC-10152 and the instance with 13 votes for
* Election id: 08-00176 associated with 08-RC-15963 and the instance with 20 votes for
* Election id: 05-00914 associated with 05-RC-15963 and the instance with 3 votes for
drop if r_case_number=="22-RC-12542" & num_valid_votes!=582
drop if r_case_number=="34-RC-02068" & num_valid_votes!=102
drop if r_case_number=="19-RC-14621" & num_valid_votes!=3
drop if r_case_number=="17-RC-12434" & num_votes_for!=1
drop if r_case_number=="16-RC-10152" & num_votes_for!=13
drop if r_case_number=="08-RC-15963" & num_votes_for!=20
drop if r_case_number=="05-RC-15963" & num_votes_for!=3


// For remaining cases we have both "Rep" and "Resul" closing methods
// But results of election are the same
// We retain both of these types of election outcomes but we do need to distinguish between them
// We do not want to double count so we keep based on election certification outcome
drop if r_case_number == "06-RC-12649" & closing_method == 4
drop if r_case_number == "13-RC-20509" & closing_method == 4
drop if r_case_number == "21-RC-20105" & closing_method != 5
drop if r_case_number == "31-RC-08296" & closing_method == 4
drop if r_case_number == "36-RC-05979" & closing_method == 4
drop if r_case_number == "36-RD-01720" & closing_method == 4


// Verify no duplicates remaining
drop dups1
duplicates tag r_case_number unit_id,g(dups1)
tab dups1
drop dups1

save "${temp}/single_dups.dta",replace



*------------------------------------------------------------------------------*
/* Multi Union Duplicates */

use "${temp}/all_data.dta",clear
drop if dups1==0
keep if multiunion==1 // 2672 observations remaining

// First deal with cases where num_votes_for==.
count if num_votes_for==.
g flag = (num_votes_for+num_votes_against == num_valid_votes)
bys r_case_number unit_id: egen check = max(flag)
replace num_votes_for=0 if num_votes_for==. & check==1 // 8/10 cases resolved
drop flag check dups

// For remaining cases can verify manually they should be set to 0
g flag = (num_votes_for == .)
bys r_case_number unit_id: egen check = max(flag)
sort check r_case_number
replace num_votes_for = 0 if num_votes_for == .
drop flag check

// Missing zeroes for votes against
count if num_votes_against==.
bys r_case_number unit_id: egen total_for = total(num_votes_for)
g flag = (total_for == num_valid_votes)
bys r_case_number unit_id: egen check = max(flag)
replace num_votes_against=0 if num_votes_against==. & check==1 // 51/54 cases resolved

// For remaining case
replace num_votes_against = num_valid_votes - total_for if num_votes_against==. 
drop flag check total_for


// Retain multiunion elections with valid counts
bys r_case_number unit_id: egen votesfor=total(num_votes_for)
g checker=1 if votesfor+num_votes_against==num_valid_votes
bys r_case_number unit_id: egen check2=total(checker)
bys r_case_number unit_id: g num=_N
g checker3=1 if check2/num==1

preserve
	keep if checker3==1
	// Identify elections where any union won and create tally of votes for any union
	replace num_votes_for=votesfor
	replace election_results=1 if num_votes_for>=num_votes_against
	replace election_results=0 if num_votes_for<num_votes_against
	drop check* votesfor num
	duplicates drop _all,force // 1321 observations dropped

	duplicates tag r_case_number unit_id,g(dups1)
	tab dups1 // no duplicates
	drop dups1

	save "${temp}/multi_dups.dta",replace
restore


drop if checker3 == 1 //  76 observations remaining

// Individual Cases
// These are verified using Certification, Election Tally, and Votes for Data sets

// For certain cases we have both "Rep" and "Resul" closing methods
// But results of election are the same
// We retain both of these types of election outcomes but we do need to distinguish between them
// We do not want to double count so arbitrarily keep one
drop if r_case_number == "22-RM-00735" & closing_method == 4


// 09-RC-17363
// Challenges in first tally are counted in second tally but only result of the challenges is reported, not the total votes
replace num_votes_for = num_votes_for + 297 if num_votes_for == 82 & r_case_number == "09-RC-17363"
replace num_votes_for = num_votes_for + 289 if num_votes_for == 13 & r_case_number == "09-RC-17363"

drop if r_case_number == "07-RC-20558" & num_votes_for == 0

// Retain certified election only
* Election id: 31-00721 associated with 31-RC-08467 - drop instance with 41 for votes which is from non-certified election
drop if r_case_number == "31-RC-08467" & num_votes_for == 41

// Votes of 5 and 6 appear to be associated with the first election rather than the re-run
drop if r_case_number == "01-RC-21523" & num_votes_for == 5
drop if r_case_number == "01-RC-21523" & num_votes_for == 6

// Assumed error in votes against (can verify that union wins certified election)
replace num_votes_against = 8 if r_case_number == "03-RC-11901"

// Missing union with 1 vote for
expand 2 if r_case_number == "05-RC-16011" & num_votes_for == 1
expand 2 if r_case_number == "29-RC-10289" & num_votes_for == 3

// Unclear why count off by 1
drop if r_case_number == "05-RC-16347" & num_votes_for == 1

// Suspect missing votes against - 39 against in first round, dissapear for unclear reasons in second round but votes for does not change
replace num_votes_against = 39 if r_case_number == "13-RC-20264"

// discrepency likely due to 5 challenged votes
replace num_valid_votes = 92 if r_case_number == "13-RC-21348"

// Suspect that one observation is from early round election
drop if r_case_number == "13-RD-02399" & num_votes_for == 66

// unclear why there are votes for two unions but can verify that one union won election with all 12 votes
drop if r_case_number == "19-RC-15232" & election_results == 0


// Size of discrepency with counts
bys r_case_number unit_id: egen total_for = total(num_votes_for)
g diff = total_for + num_votes_against - num_valid_votes


// Not clear why valid votes is missing
replace num_valid_votes = diff if r_case_number == "20-RD-02404"
replace num_valid_votes = diff if r_case_number == "22-RC-12595"
replace num_valid_votes = diff if r_case_number == "22-RC-12657"

// Appears to be due to challenged votes but election is certified with winning union
replace num_valid_votes = diff if r_case_number == "22-RC-11968"
replace num_valid_votes = diff+num_valid_votes  if r_case_number == "31-RM-01275"

// Appears to be due to type in eligible votes
replace num_valid_votes = num_elig_employees if r_case_number == "28-RC-06697"

// Unclear error but votes for seem valid - appears to be an issue between units - unclear why number of eligible votes is lower only for third election
replace num_valid_votes = diff+num_valid_votes  if r_case_number == "32-RC-05234"

drop diff
g diff = total_for + num_votes_against - num_valid_votes

// For remaining cases adjust the number of valid votes to account for the discrepency
replace num_valid_votes = diff+num_valid_votes  if r_case_number != "03-RM-00789"

// Case where we cannot determine votes against and why election is lost
drop if r_case_number == "03-RM-00789"

drop diff

// Collapse the multi-unit elections 
collapse (rawsum) num_votes_for (mean) num_elig* num_val* num_votes_against (max) election_results ///
	, by(r_case_number unit_id case_name *date* naics* bar* tally* unit_loc* petitioner case* clos*)

append using "${temp}/multi_dups.dta"
save "${temp}/multi_dups.dta",replace





*------------------------------------------------------------------------------*
// Merge data and check how many cases have been dropped

clear
append using  "${temp}/non_dups.dta" ///
	"${temp}/single_dups.dta" ///
	"${temp}/multi_dups.dta" 
	
save "${temp}/nlrb9910_pt1.dta",replace




